{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "\n",
    "示例：使用Logistic回归估计马疝病的死亡率\n",
    "\n",
    "（1）收集数据：给定数据文件。\n",
    "（2）准备数据：用Python解析文本文件并填充缺失值。\n",
    "（3）分析数据：可视化并观察数据。\n",
    "（4）训练算法：使用优化算法，找到最佳的系数。\n",
    "（5）测试算法：为了量化回归的效果，需要观察错误率。\n",
    "根据错误率决定是否退回到训练阶段，通过改变迭代的次数和步长等参数来得到更好的回归系数。\n",
    "（6）使用算法：实现一个简单的命令行程序来收集马的症状并输出预测结果。\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "def sigmoid(inX):\n",
    "    # return np.longfloat(1.0/(1+np.exp(-inX)))\n",
    "    if inX>=0:\n",
    "        return 1.0/(1+np.exp(-inX))\n",
    "    else:\n",
    "        return np.exp(inX)/(1+np.exp(inX))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "\"\"\"\n",
    "函数说明:改进的随机梯度上升算法\n",
    "\n",
    "Parameters:\n",
    "    dataMatrix - 训练数据数组\n",
    "    classlabels - 训练标签列表\n",
    "    numIter - 迭代次数\n",
    "Returns:\n",
    "    weights - 最佳回归系数\n",
    "\"\"\"\n",
    "def stocGradAscent1(dataMatrix, classLabels, numIter = 150):\n",
    "    m,n = np.shape(dataMatrix)          #返回dataMatrix的大小\n",
    "    weights = np.ones(n)                #初始化权重\n",
    "    for j in range(numIter):\n",
    "        dataIndex = list(range(m))\n",
    "        for i in range(m):\n",
    "            alpha = 4/(1.0+j+i)+0.01                                #降低alpha的大小\n",
    "            randIndex = int(np.random.uniform(0,len(dataIndex)))    #随机选择样本\n",
    "            h = sigmoid(sum(dataMatrix[randIndex]*weights))         #h是一个数值\n",
    "            error = classLabels[randIndex] - h\n",
    "            weights = weights + alpha*error*dataMatrix[randIndex]\n",
    "            del(dataIndex[randIndex])\n",
    "    return weights"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "\"\"\"\n",
    "函数说明:Logistic回归分类函数\n",
    "\n",
    "Parameters:\n",
    "    inX - 待分类的特征向量\n",
    "    weights - 训练好的回归系数\n",
    "Returns:\n",
    "    分类结果\n",
    "\"\"\"\n",
    "def classifyVector(inX, weights):\n",
    "    prob = sigmoid(sum(inX*weights))\n",
    "    if prob > 0.5: return 1.0\n",
    "    else: return 0.0"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "\"\"\"\n",
    "函数说明:Logistic回归分类测试函数\n",
    "\n",
    "Parameters:\n",
    "    无\n",
    "Returns:\n",
    "    errorRate - 错误率\n",
    "\"\"\"\n",
    "def colicTest():\n",
    "    frTrain = open('horseColicTraining.txt')        #打开训练集数据文件\n",
    "    frTest = open('horseColicTest.txt')             #打开测试集数据文件\n",
    "    trainingSet = []; trainingLabels = []           #存放训练数据集和测试数据集\n",
    "    for line in frTrain.readlines():\n",
    "        currLine = line.strip().split('\\t')\n",
    "        lineArr = []\n",
    "        for i in range(21):\n",
    "            lineArr.append(float(currLine[i]))\n",
    "        trainingSet.append(lineArr)                 #获取训练数据样本\n",
    "        trainingLabels.append(float(currLine[21]))  #获取训练数据样本标签\n",
    "    trainWeights = stocGradAscent1(np.array(trainingSet), trainingLabels, 500)  #通过logistic回归获得回归系数\n",
    "    errorCount = 0; numTestVec = 0.0\n",
    "    for line in frTest.readlines():\n",
    "        numTestVec += 1.0\n",
    "        currLine = line.strip().split('\\t')\n",
    "        lineArr = []\n",
    "        for i in range(21):\n",
    "            lineArr.append(float(currLine[i]))\n",
    "        if int(classifyVector(np.array(lineArr),trainWeights)) != int(currLine[21]):    #判断是否分类正确\n",
    "            errorCount +=1\n",
    "    errorRate = (float(errorCount/numTestVec))\n",
    "    print('the error rate of this test is: %.2f' % errorRate)\n",
    "    return errorRate"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "\"\"\"\n",
    "函数说明:Logistic回归分类测试函数\n",
    "进行多次\n",
    "Parameters:\n",
    "    无\n",
    "Returns:\n",
    "    无\n",
    "\"\"\"\n",
    "def multiTest():\n",
    "    numTests = 10; errorSum = 0.0\n",
    "    for k in range(numTests):\n",
    "        errorSum += colicTest()\n",
    "    print('after %d iterations the average error rate is: %.2f' % (numTests, errorSum/float(numTests)))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "the error rate of this test is: 0.42\n",
      "the error rate of this test is: 0.40\n",
      "the error rate of this test is: 0.37\n",
      "the error rate of this test is: 0.34\n",
      "the error rate of this test is: 0.37\n",
      "the error rate of this test is: 0.34\n",
      "the error rate of this test is: 0.39\n",
      "the error rate of this test is: 0.34\n",
      "the error rate of this test is: 0.40\n",
      "the error rate of this test is: 0.40\n",
      "after 10 iterations the average error rate is: 0.38\n"
     ]
    }
   ],
   "source": [
    "multiTest()"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
